Still under construction.

(I) Background

Name of Terms Name of Variables
Rank movie_rank
Link movie_link
Title movie_title
Year movie_year
Content Rating movie_content_rating
User Rating movie_user_rating
Number of Rater movie_num_rater
Genre movie_genre
Budget ($) movie_budget
Opening Weekend USA ($) movie_opening
Gross USA ($) movie_gross
Cumulative Worldwide Gross ($) movie_worldwide_gross

(II) Creating general list of IMDb Top Rated Movies

1. Read main page source code

# Address of the IMDb "Top Rated Movies" chart page
main_url <- "http://www.imdb.com/chart/top?ref_=ft_250"

# Download the chart page; readLines() yields one string per line of HTML
main_source_code <- readLines(main_url, encoding = "UTF-8")

2. Get 5 terms

# Get each movie's rank, link, title, year and user rating.
# Every movie row starts at a line matching the locator
# <td class="titleColumn">; the fields sit at fixed offsets from that line:
#   line + 1: rank
#   line + 2: link
#   line + 3: title
#   line + 4: year
#   line + 7: user rating
main_locator_pattern <- "<td class=\"titleColumn\">"
main_locator_pattern_lines <- grep(main_locator_pattern, main_source_code)

# Fail early with a readable message instead of a cryptic strsplit() error
# further down when the page no longer contains the locator.
if (length(main_locator_pattern_lines) == 0) {
  stop(
    "No lines matching ", main_locator_pattern,
    " were found in the main page source.",
    call. = FALSE
  )
}

# vapply(`[`, character(1), k) picks the k-th piece of every split line and
# always returns a character vector (a missing piece becomes NA).

# Rank: drop the indentation and the trailing dot ("      1." -> "1")
movie_rank <- main_source_code[main_locator_pattern_lines + 1] %>%
  str_remove_all(" ") %>%
  str_remove("\\.")

# Link: text after href=", without the query string, made absolute
movie_link <- main_source_code[main_locator_pattern_lines + 2] %>%
  strsplit("href=\"") %>%
  vapply(`[`, character(1), 2) %>%
  strsplit("\\?") %>%
  vapply(`[`, character(1), 1) %>%
  paste0("https://www.imdb.com", .)

# Title: text between the first ">" and the following "<"
movie_title <- main_source_code[main_locator_pattern_lines + 3] %>%
  strsplit(">") %>%
  vapply(`[`, character(1), 2) %>%
  strsplit("<") %>%
  vapply(`[`, character(1), 1)

# Year: text inside the parentheses
movie_year <- main_source_code[main_locator_pattern_lines + 4] %>%
  strsplit("\\(") %>%
  vapply(`[`, character(1), 2) %>%
  strsplit("\\)") %>%
  vapply(`[`, character(1), 1)

# User rating: text between the first ">" and the following "<"
movie_user_rating <- main_source_code[main_locator_pattern_lines + 7] %>%
  strsplit(">") %>%
  vapply(`[`, character(1), 2) %>%
  strsplit("<") %>%
  vapply(`[`, character(1), 1)

3. Main page visualization

# Main-page fields gathered into one table (one row per movie)
main_page <- tibble(
  Rank = movie_rank,
  Title = movie_title,
  Year = movie_year,
  `User Rating` = movie_user_rating
)

# Render as a centered table that does not stretch to full page width
kable_styling(kable(main_page, align = "c"), full_width = FALSE)
Rank Title Year User Rating
1 The Shawshank Redemption 1994 9.2
2 The Godfather 1972 9.1
3 The Godfather: Part II 1974 9.0
4 The Dark Knight 2008 9.0
5 12 Angry Men 1957 8.9
6 Schindler’s List 1993 8.9
7 The Lord of the Rings: The Return of the King 2003 8.9
8 Pulp Fiction 1994 8.8
9 The Good, the Bad and the Ugly 1966 8.8
10 The Lord of the Rings: The Fellowship of the Ring 2001 8.8
11 Fight Club 1999 8.8
12 Forrest Gump 1994 8.8
13 Inception 2010 8.7
14 The Lord of the Rings: The Two Towers 2002 8.7
15 Star Wars: Episode V - The Empire Strikes Back 1980 8.7
16 The Matrix 1999 8.6
17 Goodfellas 1990 8.6
18 One Flew Over the Cuckoo’s Nest 1975 8.6
19 Seven Samurai 1954 8.6
20 Se7en 1995 8.6
21 Life Is Beautiful 1997 8.6
22 City of God 2002 8.6
23 The Silence of the Lambs 1991 8.6
24 It’s a Wonderful Life 1946 8.6
25 Star Wars: Episode IV - A New Hope 1977 8.6
26 Saving Private Ryan 1998 8.5
27 Spirited Away 2001 8.5
28 The Green Mile 1999 8.5
29 Parasite 2019 8.5
30 Interstellar 2014 8.5
31 Léon: The Professional 1994 8.5
32 The Usual Suspects 1995 8.5
33 Harakiri 1962 8.5
34 The Lion King 1994 8.5
35 Back to the Future 1985 8.5
36 The Pianist 2002 8.5
37 Terminator 2: Judgment Day 1991 8.5
38 American History X 1998 8.5
39 Modern Times 1936 8.5
40 Psycho 1960 8.5
41 Gladiator 2000 8.5
42 City Lights 1931 8.5
43 The Departed 2006 8.5
44 The Intouchables 2011 8.5
45 Whiplash 2014 8.5
46 Hamilton 2020 8.5
47 The Prestige 2006 8.5
48 Grave of the Fireflies 1988 8.5
49 Once Upon a Time in the West 1968 8.4
50 Casablanca 1942 8.4
51 Cinema Paradiso 1988 8.4
52 Rear Window 1954 8.4
53 Alien 1979 8.4
54 Apocalypse Now 1979 8.4
55 Memento 2000 8.4
56 Raiders of the Lost Ark 1981 8.4
57 The Great Dictator 1940 8.4
58 Django Unchained 2012 8.4
59 The Lives of Others 2006 8.4
60 Joker 2019 8.4
61 Paths of Glory 1957 8.4
62 WALL·E 2008 8.4
63 The Shining 1980 8.4
64 Avengers: Infinity War 2018 8.4
65 Sunset Blvd. 1950 8.4
66 Witness for the Prosecution 1957 8.4
67 Spider-Man: Into the Spider-Verse 2018 8.4
68 Oldboy 2003 8.4
69 Princess Mononoke 1997 8.4
70 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb 1964 8.3
71 The Dark Knight Rises 2012 8.3
72 Once Upon a Time in America 1984 8.3
73 Aliens 1986 8.3
74 Your Name. 2016 8.3
75 Avengers: Endgame 2019 8.3
76 Coco 2017 8.3
77 American Beauty 1999 8.3
78 Braveheart 1995 8.3
79 3 Idiots 2009 8.3
80 Das Boot 1981 8.3
81 Toy Story 1995 8.3
82 High and Low 1963 8.3
83 Amadeus 1984 8.3
84 Capharnaüm 2018 8.3
85 Taare Zameen Par 2007 8.3
86 Inglourious Basterds 2009 8.3
87 Star Wars: Episode VI - Return of the Jedi 1983 8.3
88 Reservoir Dogs 1992 8.3
89 Good Will Hunting 1997 8.3
90 2001: A Space Odyssey 1968 8.3
91 Requiem for a Dream 2000 8.3
92 Vertigo 1958 8.3
93 M 1931 8.3
94 Eternal Sunshine of the Spotless Mind 2004 8.3
95 Dangal 2016 8.3
96 The Hunt 2012 8.3
97 Citizen Kane 1941 8.3
98 1917 2019 8.3
99 Full Metal Jacket 1987 8.2
100 Bicycle Thieves 1948 8.2
101 The Kid 1921 8.2
102 Singin’ in the Rain 1952 8.2
103 A Clockwork Orange 1971 8.2
104 North by Northwest 1959 8.2
105 Snatch 2000 8.2
106 Scarface 1983 8.2
107 Taxi Driver 1976 8.2
108 Ikiru 1952 8.2
109 Lawrence of Arabia 1962 8.2
110 Amélie 2001 8.2
111 Toy Story 3 2010 8.2
112 The Sting 1973 8.2
113 Metropolis 1927 8.2
114 A Separation 2011 8.2
115 Incendies 2010 8.2
116 For a Few Dollars More 1965 8.2
117 Come and See 1985 8.2
118 The Apartment 1960 8.2
119 Double Indemnity 1944 8.2
120 To Kill a Mockingbird 1962 8.2
121 Up 2009 8.2
122 Indiana Jones and the Last Crusade 1989 8.2
123 L.A. Confidential 1997 8.2
124 Heat 1995 8.2
125 Die Hard 1988 8.2
126 Monty Python and the Holy Grail 1975 8.2
127 Rashômon 1950 8.2
128 Yojimbo 1961 8.2
129 Batman Begins 2005 8.2
130 Green Book 2018 8.2
131 Downfall 2004 8.2
132 Children of Heaven 1997 8.2
133 Unforgiven 1992 8.2
134 Ran 1985 8.2
135 Some Like It Hot 1959 8.2
136 Howl’s Moving Castle 2004 8.2
137 A Beautiful Mind 2001 8.2
138 All About Eve 1950 8.2
139 Casino 1995 8.2
140 The Great Escape 1963 8.2
141 The Wolf of Wall Street 2013 8.2
142 Pan’s Labyrinth 2006 8.2
143 Anand 1971 8.2
144 The Secret in Their Eyes 2009 8.1
145 Lock, Stock and Two Smoking Barrels 1998 8.1
146 Raging Bull 1980 8.1
147 My Neighbor Totoro 1988 8.1
148 There Will Be Blood 2007 8.1
149 Judgment at Nuremberg 1961 8.1
150 The Treasure of the Sierra Madre 1948 8.1
151 Three Billboards Outside Ebbing, Missouri 2017 8.1
152 Dial M for Murder 1954 8.1
153 Chinatown 1974 8.1
154 The Gold Rush 1925 8.1
155 Babam ve Oglum 2005 8.1
156 Shutter Island 2010 8.1
157 No Country for Old Men 2007 8.1
158 V for Vendetta 2005 8.1
159 The Seventh Seal 1957 8.1
160 Inside Out 2015 8.1
161 Vikram Vedha 2017 8.1
162 Warrior 2011 8.1
163 The Elephant Man 1980 8.1
164 The Thing 1982 8.1
165 The Sixth Sense 1999 8.1
166 Trainspotting 1996 8.1
167 Jurassic Park 1993 8.1
168 Gone with the Wind 1939 8.1
169 The Truman Show 1998 8.1
170 Wild Strawberries 1957 8.1
171 Finding Nemo 2003 8.1
172 Blade Runner 1982 8.1
173 Stalker 1979 8.1
174 Kill Bill: Vol. 1 2003 8.1
175 Room 2015 8.1
176 The Bridge on the River Kwai 1957 8.1
177 Fargo 1996 8.1
178 Memories of Murder 2003 8.1
179 Tokyo Story 1953 8.1
180 The Third Man 1949 8.1
181 Gran Torino 2008 8.1
182 On the Waterfront 1954 8.1
183 Wild Tales 2014 8.1
184 The Deer Hunter 1978 8.1
185 Klaus 2019 8.1
186 In the Name of the Father 1993 8.1
187 Mary and Max 2009 8.1
188 Gone Girl 2014 8.1
189 The Grand Budapest Hotel 2014 8.1
190 Hacksaw Ridge 2016 8.1
191 Andhadhun 2018 8.1
192 Before Sunrise 1995 8.1
193 Catch Me If You Can 2002 8.1
194 The Big Lebowski 1998 8.1
195 Persona 1966 8.1
196 To Be or Not to Be 1942 8.1
197 The Bandit 1996 8.1
198 Prisoners 2013 8.1
199 Sherlock Jr.  1924 8.1
200 The General 1926 8.1
201 How to Train Your Dragon 2010 8.1
202 Ford v Ferrari 2019 8.1
203 Mr. Smith Goes to Washington 1939 8.1
204 12 Years a Slave 2013 8.1
205 Barry Lyndon 1975 8.1
206 Mad Max: Fury Road 2015 8.1
207 Million Dollar Baby 2004 8.1
208 Stand by Me 1986 8.1
209 Network 1976 8.1
210 Cool Hand Luke 1967 8.1
211 Dead Poets Society 1989 8.1
212 Ben-Hur 1959 8.1
213 Hachi: A Dog’s Tale 2009 8.1
214 Harry Potter and the Deathly Hallows: Part 2 2011 8.1
215 Platoon 1986 8.1
216 Into the Wild 2007 8.1
217 Logan 2017 8.1
218 The Wages of Fear 1953 8.0
219 Monty Python’s Life of Brian 1979 8.0
220 Rush 2013 8.0
221 The Handmaiden 2016 8.0
222 The Passion of Joan of Arc 1928 8.0
223 The 400 Blows 1959 8.0
224 Andrei Rublev 1966 8.0
225 Hotel Rwanda 2004 8.0
226 Spotlight 2015 8.0
227 Amores Perros 2000 8.0
228 Rififi 1955 8.0
229 La Haine 1995 8.0
230 Nausicaä of the Valley of the Wind 1984 8.0
231 Rocky 1976 8.0
232 Gangs of Wasseypur 2012 8.0
233 Monsters, Inc.  2001 8.0
234 Rebecca 1940 8.0
235 Rang De Basanti 2006 8.0
236 Before Sunset 2004 8.0
237 Portrait of a Lady on Fire 2019 8.0
238 In the Mood for Love 2000 8.0
239 Paris, Texas 1984 8.0
240 It Happened One Night 1934 8.0
241 Drishyam 2015 8.0
242 The Invisible Guest 2016 8.0
243 The Help 2011 8.0
244 The Princess Bride 1987 8.0
245 The Battle of Algiers 1966 8.0
246 The Circus 1928 8.0
247 The Terminator 1984 8.0
248 Aladdin 1992 8.0
249 Tangerines 2013 8.0
250 A Silent Voice: The Movie 2016 8.0
# Poster thumbnail markup: two lines above each locator line, keep the text
# that follows the first "> " and append a closing </img> tag.
movie_photo <- paste0(
  sapply(
    strsplit(main_source_code[main_locator_pattern_lines - 2], "> "),
    function(pieces) pieces[2]
  ),
  "</img>"
)

# Print the markup of the first movie's poster
cat(movie_photo[1])
<img src="https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_UY67_CR0,0,45,67_AL_.jpg" width="45" height="67" alt="The Shawshank Redemption"/></img>

The Shawshank Redemption

(III) Creating detailed list of IMDb Top Rated Movies

Target Regular Expression
Title h1 itemprop="name"
Year Next line of Title
Content Rating meta itemprop="contentRating"
User Rating span itemprop="ratingValue"
Number of Rater itemprop="ratingCount"
Genre span class="itemprop" itemprop="genre"
Budget <h4 class="inline">Budget
Opening Weekend USA ($) <h4 class="inline">Opening Weekend USA
Gross USA ($) <h4 class="inline">Gross
Cumulative Worldwide Gross ($) <h4 class="inline">Cumulative
#' Scrape the detail fields of a single IMDb title page.
#'
#' @param input URL (or local file path) of a title page, e.g. one element of
#'   `movie_link`. The page is read line by line with `readLines()`.
#' @return An unnamed character vector of length 10, in this order: title,
#'   year, content rating, user rating, number of raters, genres joined by
#'   ", ", budget, opening weekend USA, gross USA, cumulative worldwide gross.
#'   A field that cannot be found on the page is returned as "-".
get.target.info <- function(input) {
  page <- readLines(con = input, encoding = "UTF-8")

  # First page line matching `pattern`, shifted by `offset` lines; NA when the
  # pattern does not occur. Only the first hit is used so that a pattern that
  # appears several times still yields a single string.
  line_for <- function(pattern, offset = 0L) {
    hits <- grep(pattern, page)
    if (length(hits) == 0L) {
      return(NA_character_)
    }
    page[hits[1L] + offset]
  }

  # n-th piece of one string after splitting on the literal `sep`
  # (NA when the string is NA or has fewer than n pieces).
  piece <- function(text, sep, n) {
    strsplit(text, split = sep, fixed = TRUE)[[1L]][n]
  }

  # Text in front of the first "<", i.e. up to the next HTML tag.
  before_tag <- function(text) {
    sub("<.*$", "", text)
  }

  # Budget text after the label, with the currency made readable:
  # "XXX&nbsp;123" -> "XXX 123" for any three-letter currency code,
  # "&euro;123" -> "EUR 123", "&pound;123" -> "GBP 123".
  # Other values (e.g. "$123") are returned unchanged.
  parse_budget <- function(line) {
    budget <- piece(line, ">", 3L)
    if (is.na(budget)) {
      return(budget)
    }
    if (grepl("^[[:upper:]]{3}", budget)) {
      parts <- strsplit(budget, "&nbsp;", fixed = TRUE)[[1L]]
      if (length(parts) >= 2L) {
        budget <- paste(parts[1L], parts[2L])
      }
    } else if (startsWith(budget, "&euro;")) {
      budget <- paste("EUR", substring(budget, 7L))
    } else if (startsWith(budget, "&pound;")) {
      budget <- paste("GBP", substring(budget, 8L))
    }
    budget
  }

  # Amount that follows an <h4 class="inline"> label: the second
  # space-separated token after the label, without a trailing comma.
  amount_for <- function(pattern) {
    amount <- piece(piece(line_for(pattern), ">", 3L), " ", 2L)
    sub(",$", "", amount)
  }

  # 1. title ----
  # Cut at the next tag and drop the trailing &nbsp; separator; entities that
  # are part of the title itself (e.g. &amp;) are kept rather than truncating.
  title <- before_tag(piece(line_for("h1 itemprop=\"name\""), ">", 2L))
  title <- sub("(&nbsp;)+$", "", title)

  # 2. year ---- (on the line right after the title line)
  year <- before_tag(
    piece(line_for("h1 itemprop=\"name\"", offset = 1L), ">", 2L)
  )

  # 3. content rating ----
  content_rating <- piece(line_for("meta itemprop=\"contentRating\""), ">", 2L)

  # 4. user rating ----
  user_rating <- before_tag(
    piece(line_for("span itemprop=\"ratingValue\""), ">", 3L)
  )

  # 5. number of raters ----
  num_rater <- before_tag(piece(line_for("itemprop=\"ratingCount\""), ">", 3L))

  # 6. genre ---- (every matching line contributes one genre)
  genre_lines <- grep(
    "span class=\"itemprop\" itemprop=\"genre\"", page,
    value = TRUE
  )
  genres <- before_tag(
    vapply(genre_lines, piece, character(1), sep = ">", n = 3L,
           USE.NAMES = FALSE)
  )
  genres <- genres[!is.na(genres)]
  genre <- if (length(genres) > 0L) {
    paste(genres, collapse = ", ")
  } else {
    NA_character_
  }

  # 7. budget ----
  budget <- parse_budget(line_for("<h4 class=\"inline\">Budget"))

  # 8.-10. opening weekend, gross and cumulative worldwide gross ----
  opening <- amount_for("<h4 class=\"inline\">Opening Weekend USA")
  gross <- amount_for("<h4 class=\"inline\">Gross")
  worldwide_gross <- amount_for("<h4 class=\"inline\">Cumulative")

  # 11. result ---- (anything that could not be extracted is shown as "-")
  fields <- c(
    title, year, content_rating, user_rating, num_rater,
    genre, budget, opening, gross, worldwide_gross
  )
  fields[is.na(fields)] <- "-"
  fields
}

# Collecting data ----
# Scrape every detail page once. Iterating over movie_link itself (instead of
# a fixed 1:250) keeps this correct when the chart has a different number of
# entries, and avoids growing ten vectors element by element.
target_info <- lapply(movie_link, get.target.info)

# Each result holds the ten fields in the fixed order documented for
# get.target.info(); pull out one position at a time as a character vector
# (a missing position becomes NA).
movie_title <- vapply(target_info, `[`, character(1), 1)
movie_year <- vapply(target_info, `[`, character(1), 2)
movie_content_rating <- vapply(target_info, `[`, character(1), 3)
movie_user_rating <- vapply(target_info, `[`, character(1), 4)
movie_num_rater <- vapply(target_info, `[`, character(1), 5)
movie_genre <- vapply(target_info, `[`, character(1), 6)
movie_budget <- vapply(target_info, `[`, character(1), 7)
movie_opening <- vapply(target_info, `[`, character(1), 8)
movie_gross <- vapply(target_info, `[`, character(1), 9)
movie_worldwide_gross <- vapply(target_info, `[`, character(1), 10)

# Visualization ----
library(knitr)

# One row per movie; every column is stored as plain character text
y <- data.frame(
  movie_rank, movie_title, movie_year, movie_content_rating,
  movie_user_rating, movie_num_rater, movie_genre, movie_budget,
  movie_opening, movie_gross, movie_worldwide_gross,
  stringsAsFactors = FALSE
)
y[] <- lapply(y, as.character)

# Centered table with human-readable column headings
kable(
  y,
  align = "c",
  col.names = c(
    "Rank", "Title", "Year", "Content Rating", "User Rating",
    "Number of Rater", "Genre", "Budget", "Opening Weekend USA",
    "Gross USA", "Cumulative Worldwide Gross"
  )
)